Load results and normalize results for every amplicon

y <- readResultFile("result.csv",FALSE, ",")
targetNames <- y$Target
y$Target <- NULL
resultFileNames <- c("SN1-17_Run_14.xls", "SN1-21_Run_17.xls","SN1-23-Run_18.xls","SN1-25-Run_19.xls", "SN1-26-Run_20.xls")
for (i in 1:length(resultFileNames)){
  res <- readResultFile(resultFileNames[i], FALSE)
  namesOfColumns <- names(res)
  y[namesOfColumns] <- res
  y$Target <- NULL
}
y$Target <- targetNames
y <- y[complete.cases(y[,-1]), ]
y <- y[-53,]
sumsTotal <- rowSums(y[,-ncol(y)])
y[,-ncol(y)] <- log(y[,-ncol(y)])
#sums <- colSums(y[,-ncol(y)])
#y[,-ncol(y)] <- sweep(y[,-ncol(y)], 2, sums, '/' )
sums <- rowSums(y[,-ncol(y)])
y[,-ncol(y)]<- sweep(y[,-ncol(y)], 1, sums, '/' )
y$sum <- sumsTotal

Simple KMeans clustering with a given number of clusters

mydata <- y[, 1:(ncol(y)-2)]
fit <- kmeans(mydata, 4)
#library(cluster) 
#clusplot(mydata, fit$cluster, color=TRUE, shade=TRUE, 
#    labels=2, lines=0)
y$clusterKMeans <- fit$cluster
cat("KMeans given number of clusters: 4\n")
## KMeans given number of clusters: 4

Choose number of clusters using Calinski-Harabasz criterion

library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.2-0
fit <- cascadeKM(scale(mydata, center = TRUE,  scale = TRUE), 1, 4, iter = 1000)
plot(fit, sortg = TRUE, grpmts.plot = TRUE)

calinski.best <- as.numeric(which.max(fit$results[2,]))
cat("Calinski criterion optimal number of clusters:", calinski.best, "\n")
## Calinski criterion optimal number of clusters: 4
y$clusterCalinski <- fit$partition[,calinski.best]

Choose number of clusters using Affinity propagation

library(apcluster)
## 
## Attaching package: 'apcluster'
## 
## The following object is masked from 'package:stats':
## 
##     heatmap
d.apclus <- apcluster(negDistMat(r=2), mydata)
cat("Affinity propagation optimal number of clusters:", length(d.apclus@clusters), "\n")
## Affinity propagation optimal number of clusters: 20
y$clusterAP <- labels(d.apclus, type="enum")

Hierarchical clustering

d_dist <- dist(as.matrix(mydata))
fit <- hclust(d_dist, method="ward") 
plot(fit)
groups <- cutree(fit, k=4)
rect.hclust(fit, k=4, border="red")

y$clusterHierarchy <- groups

Hierarchical clustering with correlation matrix

t.mydata <- t(mydata)
colnames(t.mydata) <- y$Target
data.scale<- scale(t.mydata, center=TRUE, scale=TRUE)
cor_data <- cor(data.scale)
cor_d <- as.dist(cor_data)
fit <- hclust(cor_d, method="ward") 
plot(fit)
groups <- cutree(fit, k=4)
rect.hclust(fit, k=4, border="red")

y$clusterCor <- groups

Load prepared data

prepared_data <- read.table("../data/features.tsv", sep = "\t", header=TRUE)
rownames(prepared_data) <- prepared_data$Target
rownames(y) <- y$Target
valid_names <- intersect(y$Target, prepared_data$Target)
prepared_data <- prepared_data[valid_names,]
y <- y[valid_names,]
### KMeans. Draw features for every cluster
### Calinski criterion. Draw features for every cluster
r data$cluster <- as.factor(y$clusterCalinski) histogram(~data$Amplicon_GC_content|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Fwd_Primer_GC_content|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Rev_Primer_GC_content|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Amplicon_Len|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Fwd_Primer_Len|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Rev_Primer_Len|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Amplicon_MaxPolyX|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Fwd_Primer_MaxPolyX|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Rev_Primer_MaxPolyX|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Amplicon_Entropy|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Fwd_Primer_Entropy|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Rev_Primer_Entropy|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Amplicon_Hash|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Fwd_Primer_Hash|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Rev_Primer_Hash|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Fwd_END_3|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Rev_END_3|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Start_Dangling_End|data$cluster, main = "KMeans.Calinski-Harabasz criterion")
r histogram(~data$Stop_Dangling_End|data$cluster, main = "KMeans.Calinski-Harabasz criterion")

Hierarchical clustering. Draw features for every cluster

data$cluster <- as.factor(y$clusterHierarchy)
histogram(~data$Amplicon_GC_content|data$cluster, main = "Hierarchical clustering")

histogram(~data$Fwd_Primer_GC_content|data$cluster, main = "Hierarchical clustering")

histogram(~data$Rev_Primer_GC_content|data$cluster, main = "Hierarchical clustering")

histogram(~data$Amplicon_Len|data$cluster, main = "Hierarchical clustering")

histogram(~data$Fwd_Primer_Len|data$cluster, main = "Hierarchical clustering")

histogram(~data$Rev_Primer_Len|data$cluster, main = "Hierarchical clustering")

histogram(~data$Amplicon_MaxPolyX|data$cluster, main = "Hierarchical clustering")

histogram(~data$Fwd_Primer_MaxPolyX|data$cluster, main = "Hierarchical clustering")

histogram(~data$Rev_Primer_MaxPolyX|data$cluster, main = "Hierarchical clustering")

histogram(~data$Amplicon_Entropy|data$cluster, main = "Hierarchical clustering")

histogram(~data$Fwd_Primer_Entropy|data$cluster, main = "Hierarchical clustering")

histogram(~data$Rev_Primer_Entropy|data$cluster, main = "Hierarchical clustering")

histogram(~data$Amplicon_Hash|data$cluster, main = "Hierarchical clustering")

histogram(~data$Fwd_Primer_Hash|data$cluster, main = "Hierarchical clustering")

histogram(~data$Rev_Primer_Hash|data$cluster, main = "Hierarchical clustering")

histogram(~data$Fwd_END_3|data$cluster, main = "Hierarchical clustering")

histogram(~data$Rev_END_3|data$cluster, main = "Hierarchical clustering")

histogram(~data$Start_Dangling_End|data$cluster, main = "Hierarchical clustering")

histogram(~data$Stop_Dangling_End|data$cluster, main = "Hierarchical clustering")

Correlation clustering. Draw features for every cluster

data$cluster <- as.factor(y$clusterCor)
histogram(~data$Amplicon_GC_content|data$cluster, main = "Correlation clustering")

histogram(~data$Fwd_Primer_GC_content|data$cluster, main = "Correlation clustering")

histogram(~data$Rev_Primer_GC_content|data$cluster, main = "Correlation clustering")

histogram(~data$Amplicon_Len|data$cluster, main = "Correlation clustering")

histogram(~data$Fwd_Primer_Len|data$cluster, main = "Correlation clustering")

histogram(~data$Rev_Primer_Len|data$cluster, main = "Correlation clustering")

histogram(~data$Amplicon_MaxPolyX|data$cluster, main = "Correlation clustering")

histogram(~data$Fwd_Primer_MaxPolyX|data$cluster, main = "Correlation clustering")

histogram(~data$Rev_Primer_MaxPolyX|data$cluster, main = "Correlation clustering")

histogram(~data$Amplicon_Entropy|data$cluster, main = "Correlation clustering")

histogram(~data$Fwd_Primer_Entropy|data$cluster, main = "Correlation clustering")

histogram(~data$Rev_Primer_Entropy|data$cluster, main = "Correlation clustering")

histogram(~data$Amplicon_Hash|data$cluster, main = "Correlation clustering")

histogram(~data$Fwd_Primer_Hash|data$cluster, main = "Correlation clustering")

histogram(~data$Rev_Primer_Hash|data$cluster, main = "Correlation clustering")

histogram(~data$Fwd_END_3|data$cluster, main = "Correlation clustering")

histogram(~data$Rev_END_3|data$cluster, main = "Correlation clustering")

histogram(~data$Start_Dangling_End|data$cluster, main = "Correlation clustering")

histogram(~data$Stop_Dangling_End|data$cluster, main = "Correlation clustering")

—–

KMeans. Draw features for every cluster

library(lattice)
data <- prepared_data
data$cluster <- as.factor(y$clusterKMeans)
histogram(data$cluster)

amplicon_gc_content <- getFactoredDataTrunc(data$Amplicon_GC_content)
##      25%      50%      75%     100% 
## 33.77483 41.11111 51.26050 70.21277
counts <- table(data$cluster, amplicon_gc_content)
barplot(counts, main="Amplicon Distribution by Amplicon GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_gc_content <- getFactoredDataTrunc(data$Fwd_Primer_GC_content)
##      25%      50%      75%     100% 
## 38.46154 43.47826 52.17391 64.70588
counts <- table(data$cluster, fwd_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Fwd Primer GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_gc_content <- getFactoredDataTrunc(data$Rev_Primer_GC_content)
##      25%      50%      75%     100% 
## 38.46154 42.30769 47.82609 63.15789
counts <- table(data$cluster, rev_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Rev Primer GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_len <- getFactoredDataTrunc(data$Amplicon_Len)
##  25%  50%  75% 100% 
##  114  151  170  188
counts <- table(data$cluster, amplicon_len)
barplot(counts, main="Amplicon Distribution by Amplicon Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_len <- getFactoredDataTrunc(data$Fwd_Primer_Len)
##  25%  50%  75% 100% 
##   22   25   28   33
counts <- table(data$cluster, fwd_primer_len)
barplot(counts, main="Amplicon Distribution by Fwd Primer Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_len <- getFactoredDataTrunc(data$Rev_Primer_Len)
##  25%  50%  75% 100% 
##   23   26   28   33
counts <- table(data$cluster, rev_primer_len)
barplot(counts, main="Amplicon Distribution by Rev Primer Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_MaxPolyX <- getFactoredDataTrunc(data$Amplicon_MaxPolyX)
##  25%  50%  75% 100% 
##    3    4    4    8
counts <- table(data$cluster, amplicon_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Amplicon MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_MaxPolyX <- getFactoredDataTrunc(data$Fwd_Primer_MaxPolyX)
##  25%  50%  75% 100% 
##    2    2    3    4
counts <- table(data$cluster, fwd_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Fwd Primer MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_MaxPolyX <- getFactoredDataTrunc(data$Rev_Primer_MaxPolyX)
##  25%  50%  75% 100% 
##    2    2    3    4
counts <- table(data$cluster, rev_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Rev Primer MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_Entropy <- getFactoredData(data$Amplicon_Entropy)
##       25%       50%       75%      100% 
## 0.3420570 0.3456354 0.3482220 0.3541775
counts <- table(data$cluster, amplicon_Entropy)
barplot(counts, main="Amplicon Distribution by Amplicon Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_Entropy <- getFactoredData(data$Fwd_Primer_Entropy)
##       25%       50%       75%      100% 
## 0.3417350 0.3448008 0.3465696 0.3505145
counts <- table(data$cluster, fwd_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Fwd Primer Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_Entropy <- getFactoredData(data$Rev_Primer_Entropy)
##       25%       50%       75%      100% 
## 0.3432679 0.3452135 0.3465696 0.3505145
counts <- table(data$cluster, rev_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Rev Primer Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_Hash <- getFactoredDataTrunc(data$Amplicon_Hash)
##         25%         50%         75%        100% 
## 18992277214 34196138165 48742458429 62929952416
counts <- table(data$cluster, amplicon_Hash)
barplot(counts, main="Amplicon Distribution by Amplicon Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_Hash <- getFactoredDataTrunc(data$Fwd_Primer_Hash)
##         25%         50%         75%        100% 
## 13668885842 26663877899 44900388165 62098312721
counts <- table(data$cluster, fwd_primer_Hash)
barplot(counts, main="Amplicon Distribution by Fwd Primer Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_Hash <- getFactoredDataTrunc(data$Rev_Primer_Hash)
##         25%         50%         75%        100% 
## 18803378911 33335803072 48501833706 62915016535
counts <- table(data$cluster, rev_primer_Hash)
barplot(counts, main="Amplicon Distribution by Rev Primer Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

histogram(~data$cluster|data$Fwd_END_3, main = "KMeans")

histogram(~data$cluster|data$Rev_END_3, main = "KMeans")

histogram(~data$cluster|data$Start_Dangling_End, main = "KMeans")

histogram(~data$cluster|data$Stop_Dangling_End, main = "KMeans")

Calinski criterion. Draw features for every cluster

data$cluster <- as.factor(y$clusterCalinski)
histogram(data$cluster)

amplicon_gc_content <- getFactoredDataTrunc(data$Amplicon_GC_content)
##      25%      50%      75%     100% 
## 33.77483 41.11111 51.26050 70.21277
counts <- table(data$cluster, amplicon_gc_content)
barplot(counts, main="Amplicon Distribution by Amplicon GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_gc_content <- getFactoredDataTrunc(data$Fwd_Primer_GC_content)
##      25%      50%      75%     100% 
## 38.46154 43.47826 52.17391 64.70588
counts <- table(data$cluster, fwd_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Fwd Primer GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_gc_content <- getFactoredDataTrunc(data$Rev_Primer_GC_content)
##      25%      50%      75%     100% 
## 38.46154 42.30769 47.82609 63.15789
counts <- table(data$cluster, rev_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Rev Primer GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_len <- getFactoredDataTrunc(data$Amplicon_Len)
##  25%  50%  75% 100% 
##  114  151  170  188
counts <- table(data$cluster, amplicon_len)
barplot(counts, main="Amplicon Distribution by Amplicon Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_len <- getFactoredDataTrunc(data$Fwd_Primer_Len)
##  25%  50%  75% 100% 
##   22   25   28   33
counts <- table(data$cluster, fwd_primer_len)
barplot(counts, main="Amplicon Distribution by Fwd Primer Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_len <- getFactoredDataTrunc(data$Rev_Primer_Len)
##  25%  50%  75% 100% 
##   23   26   28   33
counts <- table(data$cluster, rev_primer_len)
barplot(counts, main="Amplicon Distribution by Rev Primer Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_MaxPolyX <- getFactoredDataTrunc(data$Amplicon_MaxPolyX)
##  25%  50%  75% 100% 
##    3    4    4    8
counts <- table(data$cluster, amplicon_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Amplicon MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_MaxPolyX <- getFactoredDataTrunc(data$Fwd_Primer_MaxPolyX)
##  25%  50%  75% 100% 
##    2    2    3    4
counts <- table(data$cluster, fwd_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Fwd Primer MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_MaxPolyX <- getFactoredDataTrunc(data$Rev_Primer_MaxPolyX)
##  25%  50%  75% 100% 
##    2    2    3    4
counts <- table(data$cluster, rev_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Rev Primer MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_Entropy <- getFactoredData(data$Amplicon_Entropy)
##       25%       50%       75%      100% 
## 0.3420570 0.3456354 0.3482220 0.3541775
counts <- table(data$cluster, amplicon_Entropy)
barplot(counts, main="Amplicon Distribution by Amplicon Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_Entropy <- getFactoredData(data$Fwd_Primer_Entropy)
##       25%       50%       75%      100% 
## 0.3417350 0.3448008 0.3465696 0.3505145
counts <- table(data$cluster, fwd_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Fwd Primer Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_Entropy <- getFactoredData(data$Rev_Primer_Entropy)
##       25%       50%       75%      100% 
## 0.3432679 0.3452135 0.3465696 0.3505145
counts <- table(data$cluster, rev_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Rev Primer Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_Hash <- getFactoredDataTrunc(data$Amplicon_Hash)
##         25%         50%         75%        100% 
## 18992277214 34196138165 48742458429 62929952416
counts <- table(data$cluster, amplicon_Hash)
barplot(counts, main="Amplicon Distribution by Amplicon Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_Hash <- getFactoredDataTrunc(data$Fwd_Primer_Hash)
##         25%         50%         75%        100% 
## 13668885842 26663877899 44900388165 62098312721
counts <- table(data$cluster, fwd_primer_Hash)
barplot(counts, main="Amplicon Distribution by Fwd Primer Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_Hash <- getFactoredDataTrunc(data$Rev_Primer_Hash)
##         25%         50%         75%        100% 
## 18803378911 33335803072 48501833706 62915016535
counts <- table(data$cluster, rev_primer_Hash)
barplot(counts, main="Amplicon Distribution by Rev Primer Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

histogram(~data$cluster|data$Fwd_END_3, main =  "KMeans.Calinski-Harabasz criterion")

histogram(~data$cluster|data$Rev_END_3, main =  "KMeans.Calinski-Harabasz criterion")

histogram(~data$cluster|data$Start_Dangling_End, main =  "KMeans.Calinski-Harabasz criterion")

histogram(~data$cluster|data$Stop_Dangling_End, main =  "KMeans.Calinski-Harabasz criterion")

Hierarchical clustering. Draw features for every cluster

data$cluster <- as.factor(y$clusterHierarchy)
histogram(data$cluster)

amplicon_gc_content <- getFactoredDataTrunc(data$Amplicon_GC_content)
##      25%      50%      75%     100% 
## 33.77483 41.11111 51.26050 70.21277
counts <- table(data$cluster, amplicon_gc_content)
barplot(counts, main="Amplicon Distribution by Amplicon GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_gc_content <- getFactoredDataTrunc(data$Fwd_Primer_GC_content)
##      25%      50%      75%     100% 
## 38.46154 43.47826 52.17391 64.70588
counts <- table(data$cluster, fwd_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Fwd Primer GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_gc_content <- getFactoredDataTrunc(data$Rev_Primer_GC_content)
##      25%      50%      75%     100% 
## 38.46154 42.30769 47.82609 63.15789
counts <- table(data$cluster, rev_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Rev Primer GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_len <- getFactoredDataTrunc(data$Amplicon_Len)
##  25%  50%  75% 100% 
##  114  151  170  188
counts <- table(data$cluster, amplicon_len)
barplot(counts, main="Amplicon Distribution by Amplicon Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_len <- getFactoredDataTrunc(data$Fwd_Primer_Len)
##  25%  50%  75% 100% 
##   22   25   28   33
counts <- table(data$cluster, fwd_primer_len)
barplot(counts, main="Amplicon Distribution by Fwd Primer Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_len <- getFactoredDataTrunc(data$Rev_Primer_Len)
##  25%  50%  75% 100% 
##   23   26   28   33
counts <- table(data$cluster, rev_primer_len)
barplot(counts, main="Amplicon Distribution by Rev Primer Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_MaxPolyX <- getFactoredDataTrunc(data$Amplicon_MaxPolyX)
##  25%  50%  75% 100% 
##    3    4    4    8
counts <- table(data$cluster, amplicon_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Amplicon MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_MaxPolyX <- getFactoredDataTrunc(data$Fwd_Primer_MaxPolyX)
##  25%  50%  75% 100% 
##    2    2    3    4
counts <- table(data$cluster, fwd_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Fwd Primer MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_MaxPolyX <- getFactoredDataTrunc(data$Rev_Primer_MaxPolyX)
##  25%  50%  75% 100% 
##    2    2    3    4
counts <- table(data$cluster, rev_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Rev Primer MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_Entropy <- getFactoredData(data$Amplicon_Entropy)
##       25%       50%       75%      100% 
## 0.3420570 0.3456354 0.3482220 0.3541775
counts <- table(data$cluster, amplicon_Entropy)
barplot(counts, main="Amplicon Distribution by Amplicon Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_Entropy <- getFactoredData(data$Fwd_Primer_Entropy)
##       25%       50%       75%      100% 
## 0.3417350 0.3448008 0.3465696 0.3505145
counts <- table(data$cluster, fwd_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Fwd Primer Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_Entropy <- getFactoredData(data$Rev_Primer_Entropy)
##       25%       50%       75%      100% 
## 0.3432679 0.3452135 0.3465696 0.3505145
counts <- table(data$cluster, rev_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Rev Primer Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_Hash <- getFactoredDataTrunc(data$Amplicon_Hash)
##         25%         50%         75%        100% 
## 18992277214 34196138165 48742458429 62929952416
counts <- table(data$cluster, amplicon_Hash)
barplot(counts, main="Amplicon Distribution by Amplicon Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_Hash <- getFactoredDataTrunc(data$Fwd_Primer_Hash)
##         25%         50%         75%        100% 
## 13668885842 26663877899 44900388165 62098312721
counts <- table(data$cluster, fwd_primer_Hash)
barplot(counts, main="Amplicon Distribution by Fwd Primer Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_Hash <- getFactoredDataTrunc(data$Rev_Primer_Hash)
##         25%         50%         75%        100% 
## 18803378911 33335803072 48501833706 62915016535
counts <- table(data$cluster, rev_primer_Hash)
barplot(counts, main="Amplicon Distribution by Rev Primer Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

histogram(~data$cluster|data$Fwd_END_3, main = "Hierarchical clustering")

histogram(~data$cluster|data$Rev_END_3, main = "Hierarchical clustering")

histogram(~data$cluster|data$Start_Dangling_End, main = "Hierarchical clustering")

histogram(~data$cluster|data$Stop_Dangling_End, main = "Hierarchical clustering")

Correlation clustering. Draw features for every cluster

data$cluster <- as.factor(y$clusterCor)
histogram(data$cluster)

amplicon_gc_content <- getFactoredDataTrunc(data$Amplicon_GC_content)
##      25%      50%      75%     100% 
## 33.77483 41.11111 51.26050 70.21277
counts <- table(data$cluster, amplicon_gc_content)
barplot(counts, main="Amplicon Distribution by Amplicon GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_gc_content <- getFactoredDataTrunc(data$Fwd_Primer_GC_content)
##      25%      50%      75%     100% 
## 38.46154 43.47826 52.17391 64.70588
counts <- table(data$cluster, fwd_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Fwd Primer GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_gc_content <- getFactoredDataTrunc(data$Rev_Primer_GC_content)
##      25%      50%      75%     100% 
## 38.46154 42.30769 47.82609 63.15789
counts <- table(data$cluster, rev_primer_gc_content)
barplot(counts, main="Amplicon Distribution by Rev Primer GC-content and Clusters",
  xlab="GC-content", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_len <- getFactoredDataTrunc(data$Amplicon_Len)
##  25%  50%  75% 100% 
##  114  151  170  188
counts <- table(data$cluster, amplicon_len)
barplot(counts, main="Amplicon Distribution by Amplicon Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_len <- getFactoredDataTrunc(data$Fwd_Primer_Len)
##  25%  50%  75% 100% 
##   22   25   28   33
counts <- table(data$cluster, fwd_primer_len)
barplot(counts, main="Amplicon Distribution by Fwd Primer Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_len <- getFactoredDataTrunc(data$Rev_Primer_Len)
##  25%  50%  75% 100% 
##   23   26   28   33
counts <- table(data$cluster, rev_primer_len)
barplot(counts, main="Amplicon Distribution by Rev Primer Length and Clusters",
  xlab="Length", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_MaxPolyX <- getFactoredDataTrunc(data$Amplicon_MaxPolyX)
##  25%  50%  75% 100% 
##    3    4    4    8
counts <- table(data$cluster, amplicon_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Amplicon MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_MaxPolyX <- getFactoredDataTrunc(data$Fwd_Primer_MaxPolyX)
##  25%  50%  75% 100% 
##    2    2    3    4
counts <- table(data$cluster, fwd_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Fwd Primer MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_MaxPolyX <- getFactoredDataTrunc(data$Rev_Primer_MaxPolyX)
##  25%  50%  75% 100% 
##    2    2    3    4
counts <- table(data$cluster, rev_primer_MaxPolyX)
barplot(counts, main="Amplicon Distribution by Rev Primer MaxPolyX and Clusters",
  xlab="MaxPolyX", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_Entropy <- getFactoredData(data$Amplicon_Entropy)
##       25%       50%       75%      100% 
## 0.3420570 0.3456354 0.3482220 0.3541775
counts <- table(data$cluster, amplicon_Entropy)
barplot(counts, main="Amplicon Distribution by Amplicon Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_Entropy <- getFactoredData(data$Fwd_Primer_Entropy)
##       25%       50%       75%      100% 
## 0.3417350 0.3448008 0.3465696 0.3505145
counts <- table(data$cluster, fwd_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Fwd Primer Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_Entropy <- getFactoredData(data$Rev_Primer_Entropy)
##       25%       50%       75%      100% 
## 0.3432679 0.3452135 0.3465696 0.3505145
counts <- table(data$cluster, rev_primer_Entropy)
barplot(counts, main="Amplicon Distribution by Rev Primer Entropy and Clusters",
  xlab="Entropy", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

amplicon_Hash <- getFactoredDataTrunc(data$Amplicon_Hash)
##         25%         50%         75%        100% 
## 18992277214 34196138165 48742458429 62929952416
counts <- table(data$cluster, amplicon_Hash)
barplot(counts, main="Amplicon Distribution by Amplicon Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

fwd_primer_Hash <- getFactoredDataTrunc(data$Fwd_Primer_Hash)
##         25%         50%         75%        100% 
## 13668885842 26663877899 44900388165 62098312721
counts <- table(data$cluster, fwd_primer_Hash)
barplot(counts, main="Amplicon Distribution by Fwd Primer Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

rev_primer_Hash <- getFactoredDataTrunc(data$Rev_Primer_Hash)
##         25%         50%         75%        100% 
## 18803378911 33335803072 48501833706 62915016535
counts <- table(data$cluster, rev_primer_Hash)
barplot(counts, main="Amplicon Distribution by Rev Primer Hash and Clusters",
  xlab="Hash", col=c("darkblue","red","green","blue"),
   legend = rownames(counts))

histogram(~data$cluster|data$Fwd_END_3, main = "Correlation clustering")

histogram(~data$cluster|data$Rev_END_3, main = "Correlation clustering")

histogram(~data$cluster|data$Start_Dangling_End, main = "Correlation clustering")

histogram(~data$cluster|data$Stop_Dangling_End, main = "Correlation clustering")

Amplification results